import pandas as pd,numpy as np,matplotlib.pyplot as plt,seaborn as sns
df=pd.read_csv(r"C:\Users\alasy\OneDrive\Desktop\final_df_olist1.csv")
df
C:\Users\alasy\AppData\Local\Temp\ipykernel_4984\334602695.py:1: DtypeWarning: Columns (11,12,13,14) have mixed types. Specify dtype option on import or set low_memory=False. df=pd.read_csv(r"C:\Users\alasy\OneDrive\Desktop\final_df_olist1.csv")
| Unnamed: 0 | order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_name_lenght | product_description_lenght | ... | payment_type | payment_installments | payment_value | product_category_name_english | seller_lat | seller_lng | customer_lat | customer_lng | product_category | cust_Region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1.0 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.90 | 13.29 | 58.0 | 598.0 | ... | credit_card | 2.0 | 72.19 | cool_stuff | -22.497188 | -44.127324 | -21.763186 | -41.310265 | Fashion | Southeast |
| 1 | 1 | 5c94ad4e194c0e6794688a9d2b9ea94b | 1.0 | e0cf79767c5b016251fe139915c59a26 | da8622b14eb17ae2831f4ac5b9dab84a | 2018-03-16 18:50:24 | 29.90 | 18.23 | 55.0 | 388.0 | ... | boleto | 1.0 | 48.13 | health_beauty | -22.708485 | -47.664918 | -21.763186 | -41.310265 | Health and Beauty | Southeast |
| 2 | 2 | bd31b009e1dbc47fc7c250b1e2cf5440 | 1.0 | 92e2d2146e433cd4d1f09a3f8633ead0 | 4a3ca9315b744ce9f8e9374361493884 | 2018-05-02 03:51:18 | 52.90 | 22.95 | 53.0 | 714.0 | ... | boleto | 1.0 | 75.85 | bed_bath_table | -21.757225 | -48.829541 | -21.763186 | -41.310265 | Furniture | Southeast |
| 3 | 3 | bb4eb0196897c20281a61f75ce23211c | 1.0 | 8cee1d824765335c48ccc515c2ecf4b8 | 4a3ca9315b744ce9f8e9374361493884 | 2017-07-28 14:50:11 | 96.00 | 15.42 | 54.0 | 385.0 | ... | credit_card | 9.0 | 111.42 | bed_bath_table | -21.757225 | -48.829541 | -21.763186 | -41.310265 | Furniture | Southeast |
| 4 | 4 | f9847bf9cc7336c6ba07fe2bdbb6cae1 | 1.0 | 3e5201fe0d1ba474d9b90152c83c706c | 8160255418d5aaa7dbdc9f4c64ebda44 | 2018-05-16 18:30:50 | 125.90 | 13.30 | 56.0 | 255.0 | ... | credit_card | 6.0 | 139.20 | bed_bath_table | -21.757225 | -48.829541 | -21.763186 | -41.310265 | Furniture | Southeast |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 112633 | 117720 | bb05bd3bbacf1e3c6026b43b44a6631c | 1.0 | a0425426bd812474106a782b2979e310 | eb1bf309f4f6af9a97e9ddac3fdbeabe | 2018-05-29 02:55:03 | 22.30 | 8.88 | 57.0 | 1417.0 | ... | boleto | 1.0 | 31.18 | sports_leisure | -23.702611 | -46.596395 | -23.570953 | -46.624567 | Entertainment | Southeast |
| 112634 | 117721 | c003c1face3c31cba0af4ab5314f3332 | 1.0 | dedb5adefe63796ce6edd624d0ebd8a8 | 70ee69419916dc13e3aed619061f996c | 2017-08-18 11:05:09 | 45.00 | 8.72 | 43.0 | 642.0 | ... | credit_card | 5.0 | 53.72 | furniture_decor | -23.520595 | -46.178266 | -23.456257 | -46.937905 | Furniture | Southeast |
| 112635 | 117725 | cec79ef92819a9f1f2009e1d8cf26db8 | 1.0 | ae8a92195eb75e94998a210eeb51693e | 1d139e3a3b14025640d8df1b230aace0 | 2017-09-19 12:35:16 | 7.48 | 15.10 | 43.0 | 513.0 | ... | credit_card | 2.0 | 22.58 | sports_leisure | -21.190973 | -48.156353 | -17.086066 | -42.255281 | Entertainment | Southeast |
| 112636 | 117726 | e9613fa6e02ff8ae45f37ca30b0468a1 | 1.0 | f619f125af192e28da946f3382e783a6 | 9b00cad94ef3078faf6fba2e792c158f | 2017-12-27 12:17:28 | 354.90 | 18.92 | 53.0 | 234.0 | ... | voucher | 1.0 | 250.00 | sports_leisure | -15.801399 | -43.310325 | -27.101374 | -51.246351 | Entertainment | South |
| 112637 | 117727 | e9613fa6e02ff8ae45f37ca30b0468a1 | 1.0 | f619f125af192e28da946f3382e783a6 | 9b00cad94ef3078faf6fba2e792c158f | 2017-12-27 12:17:28 | 354.90 | 18.92 | 53.0 | 234.0 | ... | credit_card | 5.0 | 123.82 | sports_leisure | -15.801399 | -43.310325 | -27.101374 | -51.246351 | Entertainment | South |
112638 rows × 46 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 112638 entries, 0 to 112637 Data columns (total 46 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 112638 non-null int64 1 order_id 112638 non-null object 2 order_item_id 112638 non-null float64 3 product_id 112638 non-null object 4 seller_id 112638 non-null object 5 shipping_limit_date 112638 non-null object 6 price 112638 non-null float64 7 freight_value 112638 non-null float64 8 product_name_lenght 112638 non-null float64 9 product_description_lenght 112638 non-null float64 10 product_photos_qty 112638 non-null float64 11 product_weight_g 112638 non-null object 12 product_length_cm 112638 non-null object 13 product_height_cm 112638 non-null object 14 product_width_cm 112638 non-null object 15 seller_zip_code_prefix 112638 non-null float64 16 seller_city 112638 non-null object 17 seller_state 112638 non-null object 18 customer_id 112638 non-null object 19 order_status 112638 non-null object 20 order_purchase_timestamp 112638 non-null object 21 order_approved_at 112638 non-null object 22 order_delivered_carrier_date 112638 non-null object 23 order_delivered_customer_date 112638 non-null object 24 order_estimated_delivery_date 112638 non-null object 25 customer_unique_id 112638 non-null object 26 customer_zip_code_prefix 112638 non-null int64 27 customer_city 112638 non-null object 28 customer_state 112638 non-null object 29 review_id 112638 non-null object 30 review_score 112638 non-null float64 31 review_comment_title 112638 non-null object 32 review_comment_message 112638 non-null object 33 review_creation_date 112638 non-null object 34 review_answer_timestamp 112638 non-null object 35 payment_sequential 112638 non-null float64 36 payment_type 112638 non-null object 37 payment_installments 112638 non-null float64 38 payment_value 112638 non-null float64 39 product_category_name_english 112638 non-null object 40 seller_lat 112638 non-null float64 41 seller_lng 112638 non-null float64 42 customer_lat 112638 non-null float64 43 customer_lng 112638 non-null float64 44 product_category 112638 non-null object 45 cust_Region 112638 non-null object dtypes: float64(15), int64(2), object(29) memory usage: 39.5+ MB
df=df.drop(columns='Unnamed: 0',axis=1)
df.describe()
| order_item_id | price | freight_value | product_name_lenght | product_description_lenght | product_photos_qty | seller_zip_code_prefix | customer_zip_code_prefix | review_score | payment_sequential | payment_installments | payment_value | seller_lat | seller_lng | customer_lat | customer_lng | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 | 112638.000000 |
| mean | 1.195289 | 119.890695 | 19.994579 | 48.790453 | 784.760658 | 2.201673 | 24534.728049 | 35009.119418 | 4.082281 | 1.091106 | 2.941929 | 171.624030 | -22.793489 | -47.249482 | -21.244659 | -46.205792 |
| std | 0.688433 | 181.346106 | 15.724898 | 10.020038 | 650.963105 | 1.713659 | 27658.740625 | 29856.092601 | 1.346116 | 0.688239 | 2.778200 | 264.617854 | 2.703788 | 2.345126 | 5.555091 | 4.039370 |
| min | 1.000000 | 0.850000 | 0.000000 | 5.000000 | 4.000000 | 1.000000 | 1001.000000 | 1003.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | -32.075303 | -63.893789 | -33.689890 | -72.668821 |
| 25% | 1.000000 | 39.900000 | 13.072500 | 42.000000 | 345.000000 | 1.000000 | 6429.000000 | 11095.000000 | 4.000000 | 1.000000 | 1.000000 | 60.772500 | -23.609666 | -48.829541 | -23.591175 | -48.109939 |
| 50% | 1.000000 | 74.900000 | 16.310000 | 52.000000 | 600.000000 | 1.000000 | 13690.000000 | 24230.000000 | 5.000000 | 1.000000 | 2.000000 | 107.900000 | -23.424361 | -46.756262 | -22.929004 | -46.632945 |
| 75% | 1.000000 | 133.000000 | 21.180000 | 57.000000 | 982.000000 | 3.000000 | 28035.000000 | 58400.000000 | 5.000000 | 1.000000 | 4.000000 | 189.157500 | -21.757225 | -46.522432 | -20.197823 | -43.658299 |
| max | 21.000000 | 6735.000000 | 409.680000 | 76.000000 | 3992.000000 | 20.000000 | 99730.000000 | 99980.000000 | 5.000000 | 26.000000 | 24.000000 | 13664.080000 | -2.503158 | -34.855831 | 42.184003 | -8.723762 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 112638 entries, 0 to 112637 Data columns (total 45 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 112638 non-null object 1 order_item_id 112638 non-null float64 2 product_id 112638 non-null object 3 seller_id 112638 non-null object 4 shipping_limit_date 112638 non-null object 5 price 112638 non-null float64 6 freight_value 112638 non-null float64 7 product_name_lenght 112638 non-null float64 8 product_description_lenght 112638 non-null float64 9 product_photos_qty 112638 non-null float64 10 product_weight_g 112638 non-null object 11 product_length_cm 112638 non-null object 12 product_height_cm 112638 non-null object 13 product_width_cm 112638 non-null object 14 seller_zip_code_prefix 112638 non-null float64 15 seller_city 112638 non-null object 16 seller_state 112638 non-null object 17 customer_id 112638 non-null object 18 order_status 112638 non-null object 19 order_purchase_timestamp 112638 non-null object 20 order_approved_at 112638 non-null object 21 order_delivered_carrier_date 112638 non-null object 22 order_delivered_customer_date 112638 non-null object 23 order_estimated_delivery_date 112638 non-null object 24 customer_unique_id 112638 non-null object 25 customer_zip_code_prefix 112638 non-null int64 26 customer_city 112638 non-null object 27 customer_state 112638 non-null object 28 review_id 112638 non-null object 29 review_score 112638 non-null float64 30 review_comment_title 112638 non-null object 31 review_comment_message 112638 non-null object 32 review_creation_date 112638 non-null object 33 review_answer_timestamp 112638 non-null object 34 payment_sequential 112638 non-null float64 35 payment_type 112638 non-null object 36 payment_installments 112638 non-null float64 37 payment_value 112638 non-null float64 38 product_category_name_english 112638 non-null object 39 seller_lat 112638 non-null float64 40 seller_lng 112638 non-null float64 41 customer_lat 112638 non-null float64 42 customer_lng 112638 non-null float64 43 product_category 112638 non-null object 44 cust_Region 112638 non-null object dtypes: float64(15), int64(1), object(29) memory usage: 38.7+ MB
df.isnull().sum()
order_id 0 order_item_id 0 product_id 0 seller_id 0 shipping_limit_date 0 price 0 freight_value 0 product_name_lenght 0 product_description_lenght 0 product_photos_qty 0 product_weight_g 0 product_length_cm 0 product_height_cm 0 product_width_cm 0 seller_zip_code_prefix 0 seller_city 0 seller_state 0 customer_id 0 order_status 0 order_purchase_timestamp 0 order_approved_at 0 order_delivered_carrier_date 0 order_delivered_customer_date 0 order_estimated_delivery_date 0 customer_unique_id 0 customer_zip_code_prefix 0 customer_city 0 customer_state 0 review_id 0 review_score 0 review_comment_title 0 review_comment_message 0 review_creation_date 0 review_answer_timestamp 0 payment_sequential 0 payment_type 0 payment_installments 0 payment_value 0 product_category_name_english 0 seller_lat 0 seller_lng 0 customer_lat 0 customer_lng 0 product_category 0 cust_Region 0 dtype: int64
#plt.figure(figsize=(10,10))
#sns.pairplot(df)
plt.figure(figsize=(10,10))
sns.countplot(x ='customer_state', data=df,order=df['customer_state'].value_counts().sort_values().index)
<AxesSubplot:xlabel='customer_state', ylabel='count'>
from datetime import datetime
df['delivered_time']=pd.to_datetime(df['order_delivered_customer_date'],
format='%Y-%m-%d').dt.date
df['estimate_time']=pd.to_datetime(df['order_estimated_delivery_date'],
format='%Y-%m-%d').dt.date
#Delivered time and Estimate time features created
df['weekly']=pd.to_datetime(df['order_delivered_customer_date'],
format='%Y-%m-%d').dt.week
## created weekly feature based on order delivered customer date
C:\Users\alasy\AppData\Local\Temp\ipykernel_4984\3795791945.py:1: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead. df['weekly']=pd.to_datetime(df['order_delivered_customer_date'],
df['yearly']=pd.to_datetime(df['order_delivered_customer_date'])\
.dt.to_period('M')
df['yearly']= df['yearly'].astype(str)
##craeted yearly based on order delivered customer date
df['diff_days']= df['delivered_time']-df['estimate_time']
df['diff_days']= df['diff_days'].dt.days
##finding different days of delivered and estimated deliveries
plt.figure(figsize=(20,10))
sns.lineplot(x='weekly', y='diff_days', data=df,estimator='mean')
plt.xlabel("Weeks")
plt.ylabel("Difference Days")
plt.title("Average Difference Days per Week")
Text(0.5, 1.0, 'Average Difference Days per Week')
## from the above graph we can understand that the earlies deliveries inthe month of october as compare to other moths in the year
## as we can also see that latest deliveries are done in the month of june to august in whole year
len(df['product_id'].unique())
31371
len(df['product_id'].str[-18:].unique())
31371
df['product_id_']=df['product_id'].str[-18:]
plt.figure(figsize=(20,10))
sns.countplot(x='product_id', data=df, palette='gist_earth',
order=df['product_id'].value_counts()[:10]\
.sort_values().index).set_title("Top 10 Products", fontsize=15,
weight='bold')
Text(0.5, 1.0, 'Top 10 Products')
## the above graph shows the top 10 produts , Because of the data information , we can't say the what are the products ,
## but we can assume that which category they belong to
df.groupby(["product_category_name_english"])["product_id_"].count().sort_values(ascending=False).head(10)
product_category_name_english bed_bath_table 11657 health_beauty 9721 sports_leisure 8718 furniture_decor 8534 computers_accessories 7901 housewares 7136 watches_gifts 6006 telephony 4547 garden_tools 4440 auto 4249 Name: product_id_, dtype: int64
plt.figure(figsize=(15,15))
plt.title("Counts of all products")
plt.xticks(rotation='vertical')
sns.countplot(x='product_category_name_english',data=df)
## countplot of products
<AxesSubplot:title={'center':'Counts of all products'}, xlabel='product_category_name_english', ylabel='count'>
group_category=df.groupby(['product_id_','product_category_name_english',])['product_id_']\
.count().sort_values(ascending=False).head(10)
group_category
product_id_ product_category_name_english 7b8ebd4e68314663af furniture_decor 524 2990de24d770e7f83d garden_tools 506 965c36a24e339b6058 bed_bath_table 506 43d311335e499d9c6b garden_tools 401 8016ad823897a372db garden_tools 392 bb87a079a1f1519f73 garden_tools 389 f6b889a5c7c61f2ac4 computers_accessories 343 41585e8d54d6772e08 watches_gifts 322 2203795c972e5804a6 health_beauty 283 5c781a9191c1e95ad7 computers_accessories 275 Name: product_id_, dtype: int64
## bygroupby seeing this the product_id and product_category the most most ordered product is from graden_tools and
## others mostly from bed_bath_table
len(df['seller_id'].unique())
2900
df['seller_id_']=df['seller_id'].str[-6:]
plt.figure(figsize=(20,10))
df['seller_id_'].value_counts()[:10].plot.pie(autopct='%1.1f%%')
plt.title("Top 10 Seller",size=14, weight='bold')
##below pie chart shows the top 10 sellers and top 3 sellers have the closer proportions
Text(0.5, 1.0, 'Top 10 Seller')
##Assuming for the orders' product category of these sellers, we can use 'product category' values.
##Below table shows the Top 10 sellers category, and since they can sell multiple product types, garden tools are the most
##selling product of the best seller.
seller_category= df.groupby(['seller_id_', 'product_category_name_english'])\
['seller_id_'].count().sort_values(ascending=False).head(10)
seller_category
seller_id_ product_category_name_english 523100 garden_tools 1944 493884 bed_bath_table 1652 7e94c0 watches_gifts 1648 dab84a bed_bath_table 1373 0e0bfa furniture_decor 1315 b010ab office_furniture 1303 f13abc telephony 1188 3ad736 cool_stuff 1099 b7556a health_beauty 1091 3b52b2 watches_gifts 1010 Name: seller_id_, dtype: int64
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(20,15))
group_category.plot.barh(ax=ax1, cmap='summer')
seller_category.plot.barh(ax=ax2, cmap='autumn')
ax1.set_title('Top10 Product', fontweight='bold')
ax2.set_title('Top10 Seller', fontweight='bold')
ax1.set_xlabel('Count', fontsize=15)
ax1.set_ylabel('Product Name', fontsize=15)
ax1.xaxis.set_tick_params(labelsize=12)
ax1.yaxis.set_tick_params(labelsize=15)
ax2.set_xlabel('Count', fontsize=15)
ax2.set_ylabel('Product Name', fontsize=15)
ax2.xaxis.set_tick_params(labelsize=12)
ax2.yaxis.set_tick_params(labelsize=15)
## Above graphs show the top products and top sellers category.
##While most selling product item belongs to the furniture_decor, most seller item belongs to garden tools.
## now we can examine the most selling item weekely purchased items which is 493884 and belong to bed_bath_table
df['order_week']= pd.to_datetime(df['order_purchase_timestamp'],
format='%Y-%m-%d').dt.week
C:\Users\alasy\AppData\Local\Temp\ipykernel_4984\209791676.py:1: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead. df['order_week']= pd.to_datetime(df['order_purchase_timestamp'],
#plt.figure(figsize=(20,12))
#sns.countplot('order_week', data=df[df['product_id_']==
# '493884'])
## Now we examine the most selling product category by weekly
#Weekly popular items
items_weekly =df.groupby(['order_week','product_category_name_english'])\
['product_category_name_english'].count().sort_values(ascending=False)
#Change to data frame
most_products= items_weekly.reset_index(name='count')
#Find the max value of row
max_selling_products= most_products[most_products['count']
== most_products.groupby(['order_week'])\
['count'].transform(max)]
max_selling_products.head(10)
| order_week | product_category_name_english | count | |
|---|---|---|---|
| 0 | 47 | bed_bath_table | 445 |
| 1 | 32 | bed_bath_table | 388 |
| 3 | 29 | bed_bath_table | 366 |
| 4 | 33 | bed_bath_table | 354 |
| 5 | 23 | bed_bath_table | 349 |
| 7 | 8 | computers_accessories | 338 |
| 9 | 31 | health_beauty | 318 |
| 10 | 19 | health_beauty | 316 |
| 11 | 20 | bed_bath_table | 315 |
| 12 | 9 | computers_accessories | 314 |
max_selling_products['product_category_name_english'].value_counts()
bed_bath_table 35 health_beauty 8 computers_accessories 3 sports_leisure 3 furniture_decor 2 housewares 1 toys 1 Name: product_category_name_english, dtype: int64
print("Number of Unique Products = ",len(df['product_category_name_english'].unique()))
Number of Unique Products = 71
from wordcloud import WordCloud
a = list(df['product_category_name_english'])
word = [x for x in a if str(x) != 'nan']
word = " ".join(word)
wordcloud = WordCloud(width = 1200, height = 800,
background_color ='white',
min_font_size = 10).generate(word)
plt.figure(figsize = (10, 10), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
## first we drop the most irrelavant colums that to make data more helpful to understand
payments= df.drop(columns=['product_name_lenght','product_description_lenght',
'product_photos_qty','product_weight_g','product_length_cm',
'product_height_cm','product_width_cm'])
price_details= df.groupby(['order_id','price','product_category_name_english',
'yearly','weekly'])[['freight_value','payment_value']].sum().reset_index()
## As written on the data description,total order value an be calculated by sum of price and freight value
price_details['total_order_value'] = price_details['price'] + price_details['freight_value']
## We can calculate Gross Profit and Profit Margin by payment value and total order value
price_details['gross_profit'] = price_details['payment_value']- price_details['total_order_value']
price_details['profit_margin'] = price_details['gross_profit']/price_details['payment_value']
price_details['profit_margin'] = price_details['profit_margin'].astype('int64')
price_details.sort_values('gross_profit', ascending=False).head(10)
| order_id | price | product_category_name_english | yearly | weekly | freight_value | payment_value | total_order_value | gross_profit | profit_margin | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1422 | 03caa2c082116e1d31e67e9ae3700499 | 1680.00 | fixed_telephony | 2017-10 | 42 | 224.08 | 109312.64 | 1904.08 | 107408.56 | 0 |
| 10198 | 1b15974a0141d54e36626dca3fdc731a | 100.00 | computers_accessories | 2018-03 | 10 | 202.40 | 44048.00 | 302.40 | 43745.60 | 0 |
| 16693 | 2cc9089445046817a7539d90805e6e5a | 989.10 | agro_industry_and_commerce | 2017-12 | 50 | 146.94 | 36489.24 | 1136.04 | 35353.20 | 0 |
| 87914 | e8fa22c3673b1dd17ea315021b1f0f61 | 284.99 | drinks | 2018-05 | 18 | 168.70 | 30186.00 | 453.69 | 29732.31 | 0 |
| 43473 | 736e1922ae60d0d6a89247b851902527 | 1790.00 | fixed_telephony | 2018-07 | 30 | 114.88 | 29099.52 | 1904.88 | 27194.64 | 0 |
| 24977 | 428a2f660dc84138d969ccd69a0ab6d5 | 65.49 | furniture_decor | 2017-12 | 50 | 243.30 | 18384.75 | 308.79 | 18075.96 | 0 |
| 21838 | 3a213fcdfe7d98be74ea0dc05a8b31ae | 108.00 | watches_gifts | 2018-01 | 4 | 186.24 | 17786.88 | 294.24 | 17492.64 | 0 |
| 93511 | f80549a97eb203e1566e026ab66f045b | 137.90 | computers_accessories | 2017-09 | 39 | 388.10 | 17671.00 | 526.00 | 17145.00 | 0 |
| 25560 | 4412d97cb2093633afa85f11db46316c | 120.00 | computers_accessories | 2018-06 | 23 | 83.99 | 15978.65 | 203.99 | 15774.66 | 0 |
| 71708 | be382a9e1ed25128148b97d6bfdb21af | 194.99 | office_furniture | 2017-11 | 47 | 479.28 | 16313.60 | 674.27 | 15639.33 | 0 |
plt.figure(figsize=(25,15))
sns.lineplot(x='yearly',y='gross_profit',
data=price_details[price_details['product_category_name_english']\
=='bed_bath_table'], label='bed_bath_table',color="green")
sns.lineplot(x='yearly', y='gross_profit',
data=price_details[price_details['product_category_name_english']\
=='health_beauty'], label='health_beauty', color="blue")
sns.lineplot(x='yearly', y='gross_profit',
data=price_details[price_details['product_category_name_english']\
=='computers_accessories'], label='computers_accessories', color="red")
sns.lineplot(x='yearly', y='gross_profit',
data=price_details[price_details['product_category_name_english']\
=='moveis_decoracao'], label='home_decoration', color="orange")
sns.lineplot(x='yearly', y='gross_profit',
data=price_details[price_details['product_category_name_english']\
=='furniture_decor'], label='furniture_decor', color="purple")
plt.title("Gross Profit of Top 5 Products (2016-2018)",fontweight='bold')
Text(0.5, 1.0, 'Gross Profit of Top 5 Products (2016-2018)')
#Above graph shows yearly 'gross profit' distribution for the top 5 product category of ordered products and sellers.
#According to data, we can say that informatic_accessories have the highest gross profit in Agust 2017 to November 2017.
#On the other hand, the top 3 products which are bed_bath_table, health_beauty and computers_accessories, gross profits
#are less than 200 units of currency.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 112638 entries, 0 to 112637 Data columns (total 53 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 112638 non-null object 1 order_item_id 112638 non-null float64 2 product_id 112638 non-null object 3 seller_id 112638 non-null object 4 shipping_limit_date 112638 non-null object 5 price 112638 non-null float64 6 freight_value 112638 non-null float64 7 product_name_lenght 112638 non-null float64 8 product_description_lenght 112638 non-null float64 9 product_photos_qty 112638 non-null float64 10 product_weight_g 112638 non-null object 11 product_length_cm 112638 non-null object 12 product_height_cm 112638 non-null object 13 product_width_cm 112638 non-null object 14 seller_zip_code_prefix 112638 non-null float64 15 seller_city 112638 non-null object 16 seller_state 112638 non-null object 17 customer_id 112638 non-null object 18 order_status 112638 non-null object 19 order_purchase_timestamp 112638 non-null object 20 order_approved_at 112638 non-null object 21 order_delivered_carrier_date 112638 non-null object 22 order_delivered_customer_date 112638 non-null object 23 order_estimated_delivery_date 112638 non-null object 24 customer_unique_id 112638 non-null object 25 customer_zip_code_prefix 112638 non-null int64 26 customer_city 112638 non-null object 27 customer_state 112638 non-null object 28 review_id 112638 non-null object 29 review_score 112638 non-null float64 30 review_comment_title 112638 non-null object 31 review_comment_message 112638 non-null object 32 review_creation_date 112638 non-null object 33 review_answer_timestamp 112638 non-null object 34 payment_sequential 112638 non-null float64 35 payment_type 112638 non-null object 36 payment_installments 112638 non-null float64 37 payment_value 112638 non-null float64 38 product_category_name_english 112638 non-null object 39 seller_lat 112638 non-null float64 40 seller_lng 112638 non-null float64 41 customer_lat 112638 non-null float64 42 customer_lng 112638 non-null float64 43 product_category 112638 non-null object 44 cust_Region 112638 non-null object 45 delivered_time 112638 non-null object 46 estimate_time 112638 non-null object 47 weekly 112638 non-null int64 48 yearly 112638 non-null object 49 diff_days 112638 non-null int64 50 product_id_ 112638 non-null object 51 seller_id_ 112638 non-null object 52 order_week 112638 non-null int64 dtypes: float64(15), int64(4), object(34) memory usage: 45.5+ MB
ax = df[df['payment_type']!='not_defined']['payment_type'].value_counts().plot(kind='pie',figsize = (7,7),autopct='%.2f')
plt.title("Percent of Payment Options Used By Customers")
plt.show()
## this pie chart shows that the percent of payments options used by customers
## mostly used payment mode is credit_card
ax = df[df['payment_type'] == 'credit_card']['payment_installments'].value_counts().plot(kind='bar')
ax.set_title('payment_installments')
plt.show()
## the below graph shows that the customer wants more installments on the products which is costly
ax = sns.catplot('payment_installments','payment_value',kind='bar',
data = df[df['payment_type'] == 'credit_card'])
plt.show()
C:\Users\alasy\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
ax = df['review_score'].value_counts().plot(kind='bar',figsize=(8,8))
ax.set_title('Reviews Count')
plt.show()
## this above graphs shows the people giving stars to the produts
## over 60000 people gave 5 stars
l = []
for i in range(len(df)):
if df['review_score'][i] == 4 or df['review_score'][i] == 5:
l.append('positive')
elif df['review_score'][i] == 1 or df['review_score'][i] == 2:
l.append('negative')
else:
l.append('neutral')
df['posorneg'] = l
ax = sns.countplot('posorneg',data = df)
ax.set_title('review Count')
plt.show()
C:\Users\alasy\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
df['review_creation_date'] = pd.to_datetime(df['review_creation_date'])
df['review_creation_date_month'] = df['review_creation_date'].dt.to_period('M')
pd.crosstab(df['review_creation_date_month'],df['review_score'])
| review_score | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 |
|---|---|---|---|---|---|
| review_creation_date_month | |||||
| 2016-10 | 26 | 9 | 14 | 34 | 123 |
| 2016-11 | 23 | 4 | 12 | 21 | 50 |
| 2016-12 | 7 | 0 | 1 | 1 | 10 |
| 2017-01 | 28 | 4 | 29 | 41 | 194 |
| 2017-02 | 103 | 50 | 142 | 323 | 1034 |
| 2017-03 | 237 | 75 | 247 | 584 | 1592 |
| 2017-04 | 210 | 89 | 215 | 432 | 1271 |
| 2017-05 | 403 | 143 | 365 | 860 | 2395 |
| 2017-06 | 331 | 100 | 343 | 777 | 2279 |
| 2017-07 | 342 | 138 | 348 | 767 | 2380 |
| 2017-08 | 405 | 163 | 401 | 962 | 3272 |
| 2017-09 | 388 | 141 | 346 | 967 | 2865 |
| 2017-10 | 537 | 159 | 379 | 1044 | 3023 |
| 2017-11 | 559 | 173 | 518 | 1072 | 3152 |
| 2017-12 | 1376 | 374 | 789 | 1738 | 4788 |
| 2018-01 | 824 | 272 | 628 | 1318 | 3839 |
| 2018-02 | 863 | 243 | 653 | 1287 | 3724 |
| 2018-03 | 1833 | 373 | 858 | 1619 | 4260 |
| 2018-04 | 1260 | 352 | 719 | 1531 | 4401 |
| 2018-05 | 848 | 244 | 638 | 1733 | 5153 |
| 2018-06 | 814 | 267 | 656 | 1482 | 4722 |
| 2018-07 | 518 | 156 | 466 | 1148 | 4100 |
| 2018-08 | 872 | 296 | 714 | 1964 | 6193 |
df = df.sort_values('review_creation_date_month')
g = sns.catplot('review_creation_date_month',kind="count",col = 'review_score',data = df)
g.set_xticklabels(rotation=90)
plt.show()
C:\Users\alasy\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
## above plots clearly tells us that with time positive reviews are increasing and negative ones are decreasing
#CORR=df.corr()
#sns.heatmap(CORR,annot=True)
plt.figure(figsize=(15,15))
plt.title("product_category_name V/S freight_value")
plt.xlabel("product_category_name")
plt.ylabel("freight_value")
plt.xticks(rotation='vertical')
sns.scatterplot(df["product_category_name_english"],df["freight_value"])
## relation between product_category_name_english vs freight_value
## stationary and health_beauty has best relatioship
C:\Users\alasy\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:title={'center':'product_category_name V/S freight_value'}, xlabel='product_category_name', ylabel='freight_value'>
actual_continuous_cols = ['price','freight_value','payment_value','product_weight_g','product_length_cm','product_height_cm','product_width_cm']
df[actual_continuous_cols].describe()
| price | freight_value | payment_value | |
|---|---|---|---|
| count | 112638.000000 | 112638.000000 | 112638.000000 |
| mean | 119.890695 | 19.994579 | 171.624030 |
| std | 181.346106 | 15.724898 | 264.617854 |
| min | 0.850000 | 0.000000 | 0.000000 |
| 25% | 39.900000 | 13.072500 | 60.772500 |
| 50% | 74.900000 | 16.310000 | 107.900000 |
| 75% | 133.000000 | 21.180000 | 189.157500 |
| max | 6735.000000 | 409.680000 | 13664.080000 |
import plotly.express as px
!pip install geopy
Collecting geopy
Downloading geopy-2.4.0-py3-none-any.whl (125 kB)
-------------------------------------- 125.4/125.4 kB 2.5 MB/s eta 0:00:00
Collecting geographiclib<3,>=1.52
Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
---------------------------------------- 40.3/40.3 kB ? eta 0:00:00
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.0
import geopy.distance
def dist_cust_seller(lat1,lon1,lat2,lon2):
coords_1 = (lat1, lon1)
coords_2 = (lat2, lon2)
return geopy.distance.geodesic(coords_1, coords_2).km
df_temp = df[['customer_lat','customer_lng','seller_lat','seller_lng']]
df['distance_seller_customer'] = df_temp.apply(lambda x: dist_cust_seller(x.customer_lat,
x.customer_lng,x.seller_lat,x.seller_lng),axis = 1)
sns.distplot(df['distance_seller_customer'])
C:\Users\alasy\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
<AxesSubplot:xlabel='distance_seller_customer', ylabel='Density'>
## most of the customer are in ditsance between 0to 100km
df['distance_seller_customer'].min()
0.0
df['distance_seller_customer'].max()
8652.125673340268
df_distance=df.groupby(['customer_state','seller_state'])['distance_seller_customer'].mean()
sns.kdeplot(df_distance,color='magenta')
<AxesSubplot:xlabel='distance_seller_customer', ylabel='Density'>
pd.crosstab(df['cust_Region'],df['product_category']).plot(kind = 'bar',stacked = True)
<AxesSubplot:xlabel='cust_Region'>
## the above shows the products that are avaliable in each region most of the products are avaliable in southeast region
px.scatter_3d(df,x='cust_Region',y='product_category',z ='cust_Region',color='product_category')
plt.figure(figsize=(20,10))
df['product_category'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Product Categories",size=14, weight='bold')
Text(0.5, 1.0, 'Product Categories')
## the pie charts describes the product_categories most products from electronics and the least products are food and drinks
##the entertainment,health and beauty and houseandgarden ratios are closed in range
df['total_price'] = df[['price','freight_value']].sum(axis =1)
product_stats = df.groupby('product_id').agg({'order_item_id': 'sum', 'total_price': 'mean'}).reset_index()
# Plot the data
plt.figure(figsize=(10, 6)) # Adjust the figure size as needed
plt.scatter( product_stats['total_price'],product_stats['order_item_id'], alpha=0.5)
plt.ylabel('Number of Items Sold')
plt.xlabel('Total Price')
plt.title('Number of Items Sold vs. Total Price for Each Product')
plt.grid(True)
plt.show()
df['profit_loss'] = df['payment_value'] - df['total_price']
df[df['profit_loss'] == df['profit_loss'].min()][['customer_id','order_id','order_item_id',
'product_category_name_english','price','freight_value','payment_value',
'payment_type','payment_installments','order_status','review_score']]
| customer_id | order_id | order_item_id | product_category_name_english | price | freight_value | payment_value | payment_type | payment_installments | order_status | review_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 42006 | eb7a157e8da9c488cd4ddc48711f1097 | 9de73f3e6157169ad6c32b9f313c7dcb | 1.0 | baby | 3899.0 | 135.44 | 135.44 | credit_card | 1.0 | delivered | 5.0 |
df[df['profit_loss'] == df['profit_loss'].max()][['customer_id','order_id','order_item_id','product_category_name_english',
'price','freight_value','payment_value','payment_type',
'payment_installments','order_status','review_score','profit_loss']]
| customer_id | order_id | order_item_id | product_category_name_english | price | freight_value | payment_value | payment_type | payment_installments | order_status | review_score | profit_loss | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 78670 | 1617b1357756262bfa56ab541c47bc16 | 03caa2c082116e1d31e67e9ae3700499 | 2.0 | fixed_telephony | 1680.0 | 28.01 | 13664.08 | credit_card | 1.0 | delivered | 1.0 | 11956.07 |
| 78671 | 1617b1357756262bfa56ab541c47bc16 | 03caa2c082116e1d31e67e9ae3700499 | 3.0 | fixed_telephony | 1680.0 | 28.01 | 13664.08 | credit_card | 1.0 | delivered | 1.0 | 11956.07 |
| 78672 | 1617b1357756262bfa56ab541c47bc16 | 03caa2c082116e1d31e67e9ae3700499 | 4.0 | fixed_telephony | 1680.0 | 28.01 | 13664.08 | credit_card | 1.0 | delivered | 1.0 | 11956.07 |
| 78673 | 1617b1357756262bfa56ab541c47bc16 | 03caa2c082116e1d31e67e9ae3700499 | 5.0 | fixed_telephony | 1680.0 | 28.01 | 13664.08 | credit_card | 1.0 | delivered | 1.0 | 11956.07 |
| 78674 | 1617b1357756262bfa56ab541c47bc16 | 03caa2c082116e1d31e67e9ae3700499 | 6.0 | fixed_telephony | 1680.0 | 28.01 | 13664.08 | credit_card | 1.0 | delivered | 1.0 | 11956.07 |
| 78675 | 1617b1357756262bfa56ab541c47bc16 | 03caa2c082116e1d31e67e9ae3700499 | 7.0 | fixed_telephony | 1680.0 | 28.01 | 13664.08 | credit_card | 1.0 | delivered | 1.0 | 11956.07 |
| 78676 | 1617b1357756262bfa56ab541c47bc16 | 03caa2c082116e1d31e67e9ae3700499 | 8.0 | fixed_telephony | 1680.0 | 28.01 | 13664.08 | credit_card | 1.0 | delivered | 1.0 | 11956.07 |
| 78669 | 1617b1357756262bfa56ab541c47bc16 | 03caa2c082116e1d31e67e9ae3700499 | 1.0 | fixed_telephony | 1680.0 | 28.01 | 13664.08 | credit_card | 1.0 | delivered | 1.0 | 11956.07 |
px.scatter_3d(df,x='total_price',y='payment_value',z = 'profit_loss',color='product_category')
px.scatter_3d(df,x = 'distance_seller_customer',y = 'freight_value', z= 'payment_value',color = 'product_category')
actual_continuous_cols.append('distance_seller_customer')
sns.pairplot(df[actual_continuous_cols])
<seaborn.axisgrid.PairGrid at 0x1c40bf91b50>
CORR = df[actual_continuous_cols].corr()
sns.heatmap(CORR,annot = True)
<AxesSubplot:>
#We can see there is a high correlation between product_weight and freight_value, payment_value and price.
#Also between product dimensions.